{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "X_train = pd.read_csv('features/X_train_features.csv')\n",
    "y_train = pd.read_csv('y_train.csv', index_col='id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "# Imput missing values using the mean of each column (basic : try to find more pertinent)\n",
    "\n",
    "# imput missing values using the k-neighbors imputer (more advanced)\n",
    "from sklearn.impute import KNNImputer\n",
    "\n",
    "\n",
    "# Create the imputer object, with 10 neighbors\n",
    "imputer = KNNImputer(n_neighbors=10, weights='distance')\n",
    "\n",
    "# Fit the imputer object on the train data\n",
    "imputer.fit(X_train)\n",
    "\n",
    "# Impute the missing values on the train and test data\n",
    "X_train = imputer.transform(X_train)\n",
    "\n",
    "# Check that there is no more missing values    \n",
    "print(np.isnan(X_train).sum())\n",
    "print(X_train.shape, y_train.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove features with low variance\n",
    "from sklearn.feature_selection import VarianceThreshold\n",
    "sel = VarianceThreshold(threshold=0.1)  # remove features with more than 80% variance\n",
    "X_train = sel.fit_transform(X_train, y_train)\n",
    "print(X_train.shape, y_train.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Drop highly correlated features\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# Assuming that X_train is your ndarray and it only contains feature columns\n",
    "df = pd.DataFrame(X_train)\n",
    "correlation_matrix = df.corr()\n",
    "\n",
    "# Create a set to hold the correlated columns\n",
    "corr_columns = set()\n",
    "\n",
    "# Iterate over the correlation matrix\n",
    "for i in range(len(correlation_matrix.columns)):\n",
    "    for j in range(i):\n",
    "        # If the correlation between the columns is high, add it to the set\n",
    "        if abs(correlation_matrix.iloc[i, j]) > 0.9:\n",
    "            colname = correlation_matrix.columns[i]\n",
    "            corr_columns.add(colname)\n",
    "\n",
    "# Get the indices of the relevant features\n",
    "relevant_features = [df.columns.get_loc(c) for c in df.columns if c not in corr_columns]\n",
    "\n",
    "X_train = X_train[:, relevant_features]\n",
    "# Print the relevant feature indices\n",
    "print(\"Number of relevant features : \", X_train.shape[1])\n",
    "print(relevant_features)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_train = y_train.to_numpy().ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select the most relevant features\n",
    "from sklearn.feature_selection import SelectKBest, f_regression\n",
    "\n",
    "# Create the SelectKBest with the mutual info strategy\n",
    "selector = SelectKBest(f_regression, k=300)\n",
    "\n",
    "# Fit the object to the training data\n",
    "selector.fit(X_train, y_train)\n",
    "\n",
    "# Transform the data\n",
    "X_train = selector.transform(X_train)\n",
    "print(X_train.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_df = pd.DataFrame(y_train, columns=['target'])\n",
    "y_df.to_csv('public/y_train.csv', index_label='id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_df = pd.DataFrame(X_train, index=y_df.index)\n",
    "X_train_df.to_csv('features/X_train_features.csv')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
